Data Source: This analysis uses the social_media_viral_content_dataset, sourced from Kaggle (uploaded by Ali Hussain). It contains simulated cross-platform social media metrics used to evaluate content-format performance.
## Rows: 2000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): post_id, platform, content_type, topic, language, region, hashtags
## dbl (7): views, likes, comments, shares, engagement_rate, sentiment_score, ...
## dttm (1): post_datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 2. Clean the data
# Standardize names, drop exact duplicates and rows missing core metrics,
# then derive the engagement-rate metric used throughout the analysis.
clean_social_data <- raw_social_data %>%
  clean_names() %>%
  distinct() %>%
  drop_na(views, likes, shares, content_type) %>%
  # Guard against division by zero: a post with 0 recorded views would
  # otherwise yield an Inf engagement_rate downstream
  filter(views > 0) %>%
  # Deep-engagement metric: total interactions per view.
  # NOTE(review): `comments` is not covered by drop_na() above, so a missing
  # comment count makes engagement_rate NA — confirm whether comments should
  # also be required (downstream code uses na.rm to stay safe).
  mutate(engagement_rate = (likes + comments + shares) / views)
# 3. Verify
glimpse(clean_social_data)

## Rows: 2,000
## Columns: 15
## $ post_id <chr> "SM_100000", "SM_100001", "SM_100002", "SM_100003", "S…
## $ platform <chr> "Instagram", "Instagram", "YouTube Shorts", "X", "YouT…
## $ content_type <chr> "text", "carousel", "video", "text", "text", "carousel…
## $ topic <chr> "Sports", "Sports", "Technology", "Politics", "Educati…
## $ language <chr> "ur", "ur", "ur", "ur", "es", "hi", "hi", "es", "es", …
## $ region <chr> "UK", "Brazil", "UK", "US", "US", "Brazil", "UK", "Pak…
## $ post_datetime <dttm> 2024-12-10, 2024-10-13, 2024-05-03, 2024-08-04, 2024-…
## $ hashtags <chr> "#tech #funny #music", "#news #fyp #funny #ai #trendin…
## $ views <dbl> 2319102, 2538464, 1051176, 5271440, 3186256, 6513472, …
## $ likes <dbl> 122058, 110368, 87598, 329465, 199141, 465248, 2847, 3…
## $ comments <dbl> 15800, 11289, 47196, 774, 5316, 27485, 194, 31556, 145…
## $ shares <dbl> 861, 54887, 44132, 59736, 83105, 25659, 84655, 11395, …
## $ engagement_rate <dbl> 0.05981583, 0.06954757, 0.17021507, 0.07397884, 0.0902…
## $ sentiment_score <dbl> 0.464, -0.800, 0.416, 0.877, 0.223, -0.907, -0.235, 0.…
## $ is_viral <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, …
# 1. Aggregate performance metrics to identify baseline 'Format DNA'
# One summary row per content format: volume, average reach/engagement,
# and how often posts of that format go viral.
content_performance <- clean_social_data %>%
  group_by(content_type) %>%
  summarize(
    total_posts = n(),
    # na.rm = TRUE: engagement_rate (via missing comments) can be NA, and a
    # single NA would otherwise blank out an entire group's average
    avg_views = mean(views, na.rm = TRUE),
    avg_likes = mean(likes, na.rm = TRUE),
    avg_engagement_rate = mean(engagement_rate, na.rm = TRUE),
    viral_count = sum(is_viral, na.rm = TRUE)
  ) %>%
  # Calculate what percentage of posts for each format went viral
  mutate(viral_percentage = (viral_count / total_posts) * 100) %>%
  # Rounding is a no-op on the integer count columns, so across() is safe
  mutate(across(where(is.numeric), ~round(., 2))) %>%
  # Sort the results from highest engagement rate to lowest
  arrange(desc(avg_engagement_rate))
# 2. Print the results as a sortable table
datatable(content_performance,
  style = 'bootstrap',
  colnames = c("Format", "Total Posts", "Avg Views", "Avg Likes", "Avg Engagement", "Viral Count", "Viral %"),
  options = list(dom = 't', ordering = TRUE, paging = FALSE),
  rownames = FALSE
) %>%
  # Thousands separators + 2 decimals for the volume columns.
  # Columns are referenced by (data) name rather than position — indices are
  # fragile with rownames = FALSE — and avg_engagement_rate is deliberately
  # excluded here: the original code formatted it twice with conflicting
  # digit settings (2, then 3).
  formatRound(columns = c("total_posts", "avg_views", "avg_likes", "viral_percentage"),
              digits = 2) %>%
  # 3 decimals for engagement so small differences between formats are visible
  formatRound(columns = "avg_engagement_rate", digits = 3)
The Insight: Engagement spikes significantly on
Mondays and remains strong through Tuesday, with a secondary peak on
weekends. This aligns with users transitioning back to their weekly
routines and planning their at-home activities after a busy weekend.
The Strategy: By capitalizing on this
“return-to-routine” behavior, the platform can target its highest-value
planning tools and lifestyle content on Sundays and Mondays—exactly when
users are at home and establishing their schedules for the week.
# 1. Prep the data: Extract Day of the Week and calculate average engagement
day_data <- clean_social_data %>%
  mutate(
    # Full day-of-week label (Monday, Tuesday, ...). wday(label = TRUE)
    # returns an ordered factor, so bars plot in calendar order rather than
    # alphabetically. NOTE(review): first day depends on locale/week_start —
    # confirm whether the week should start on Sunday or Monday.
    day_of_week = wday(post_datetime, label = TRUE, abbr = FALSE)
  ) %>%
  group_by(day_of_week) %>%
  summarize(
    # na.rm = TRUE: engagement_rate can be NA when comments are missing
    avg_engagement = mean(engagement_rate, na.rm = TRUE),
    total_posts = n()
  )
# 2. Build the visual: bar chart of average engagement per weekday,
# with a custom tooltip consumed later by ggplotly().
day_plot <- ggplot(
  day_data,
  aes(
    x = day_of_week,
    y = avg_engagement,
    fill = day_of_week,
    # Tooltip text for the interactive version of the chart
    text = paste("Day:", day_of_week,
                 "<br>Avg Engagement:", round(avg_engagement, 4),
                 "<br>Total Posts:", total_posts)
  )
) +
  # Clean bars; the fill legend is redundant with the x axis
  geom_col(show.legend = FALSE, alpha = 0.9) +
  # Sleek, modern palette (viridis 'mako')
  scale_fill_viridis_d(option = "mako") +
  theme_minimal() +
  theme(
    # Drop vertical grid lines for a cleaner dashboard look
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    title = "Engagement Trends by Day of the Week",
    subtitle = "Identifying peak traffic days for product launch",
    x = "",
    y = "Average Engagement Rate"
  )
# 3. Render it as an interactive widget
ggplotly(day_plot, tooltip = "text")

The Insight: Different formats serve entirely
different user behaviors. Reach and sharing are heavily driven by video,
while direct interaction (comments and likes) is driven by text and
images.
The Strategy: The interface cannot treat
all content equally. The UI must be modular, offering a high-velocity,
full-screen feed for quick video discovery, alongside focused,
distraction-free layouts for text-based community discussions.
# 1. Prep and scale the data (0 to 100)
# Average each metric per content format, then rescale every metric to a
# common 0-100 range so the radar axes are comparable.
radar_format <- clean_social_data %>%
  group_by(content_type) %>%
  summarize(
    # na.rm = TRUE: a single NA (e.g. missing comments feeding
    # engagement_rate) would otherwise blank out a whole axis
    Views = mean(views, na.rm = TRUE),
    Likes = mean(likes, na.rm = TRUE),
    Comments = mean(comments, na.rm = TRUE),
    Shares = mean(shares, na.rm = TRUE),
    Engagement = mean(engagement_rate, na.rm = TRUE)
  ) %>%
  mutate(across(-content_type, ~ rescale(., to = c(0, 100))))
# 2. Extract stats (the first stat is repeated at the end to close the shape)
categories <- c("Views", "Likes", "Comments", "Shares", "Engagement", "Views")

# Pull one content type's scaled metrics as a plain numeric vector, appending
# the first value again so the radar polygon closes. unlist() is used instead
# of as.numeric() because as.numeric() cannot coerce a one-row data frame /
# tibble subset directly (it is a list underneath).
extract_stats <- function(df, type) {
  vals <- unlist(df[df$content_type == type, -1], use.names = FALSE)
  c(vals, vals[1])
}

video_stats <- extract_stats(radar_format, "video")
text_stats  <- extract_stats(radar_format, "text")
image_stats <- extract_stats(radar_format, "image")
# 3. Build the plot
# One scatterpolar trace per content format; fill = 'toself' closes each
# polygon so the overlapping "Format DNA" shapes can be compared at a glance.
# The semi-transparent fills (alpha 0.3) keep overlapping traces legible.
# (The chain continues with layout() on the next line.)
plot_ly(type = 'scatterpolar', fill = 'toself', mode = 'lines+markers') %>%
add_trace(r = video_stats, theta = categories, name = 'Video', fillcolor = 'rgba(255, 165, 0, 0.3)', line = list(color = 'darkorange')) %>%
add_trace(r = text_stats, theta = categories, name = 'Text', fillcolor = 'rgba(0, 191, 255, 0.3)', line = list(color = 'deepskyblue')) %>%
add_trace(r = image_stats, theta = categories, name = 'Image', fillcolor = 'rgba(50, 205, 50, 0.3)', line = list(color = 'limegreen')) %>%
layout(polar = list(radialaxis = list(visible = TRUE, range = c(0, 100))), title = "Content Format DNA: Engagement vs. Reach")

The Insight: Mapping engagement across legacy
platforms reveals a polarized landscape: highly saturated “hotspots” of
intense user attention, alongside distinct “cold zones” where specific
formats are neglected.
The Strategy: This dictates
a two-pronged launch plan. We will design our core platform features to
capture the underserved “cold zones” for product differentiation, while
concentrating our initial user acquisition and advertising spend
directly within the proven “hotspots” on legacy networks.
# 1. Prep the Data: Group by BOTH Platform and Format
# .groups = "drop" returns an ungrouped tibble; na.rm = TRUE guards against
# NA engagement rates (comments can be missing upstream).
matrix_data <- clean_social_data %>%
  group_by(platform, content_type) %>%
  summarize(avg_engagement = mean(engagement_rate, na.rm = TRUE), .groups = "drop")
# 2. Build the Heatmap: platform x format grid, colored by average engagement,
# with a custom tooltip consumed later by ggplotly().
heatmap_plot <- ggplot(
  matrix_data,
  aes(
    x = content_type,
    y = platform,
    fill = avg_engagement,
    text = paste("Platform:", platform,
                 "<br>Format:", content_type,
                 "<br>Avg Engagement:", round(avg_engagement, 3))
  )
) +
  # White borders give a clean separation between grid squares
  geom_tile(color = "white", linewidth = 0.5) +
  # 'plasma': high-contrast dark-to-light continuous scale
  scale_fill_viridis_c(option = "plasma", name = "Engagement\nRate") +
  theme_minimal() +
  theme(
    # No background grid — flush, premium look
    panel.grid = element_blank(),
    axis.text.y = element_text(hjust = 1)
  ) +
  labs(
    title = "The Competitive Landscape: Engagement Hotspots",
    subtitle = "Identifying market gaps for Kitchenship UX planning",
    x = "Content Format",
    # Y label left blank — the platform names speak for themselves
    y = ""
  )
# 3. Make it interactive
ggplotly(heatmap_plot, tooltip = "text")

The Insight: While isolating the overarching
“Lifestyle” category provides a valuable baseline for US demographic
behavior, it also highlights the noise and limitations of broad
categorization on legacy platforms.
The Strategy:
A higher level of granular detail is required for our product. We must
develop a “broader span of a narrower niche,” building a hyper-specific,
proprietary taxonomy to seed our search architecture and auto-suggestion
UI, rather than relying on generic macro-tags.
# 1. Filter for the Kitchenship Target Audience: US "Lifestyle" posts that
#    actually carry hashtags (rows with missing hashtags are unusable below)
kitchenship_cohort <- clean_social_data %>%
  filter(
    topic == "Lifestyle",
    region == "US",
    !is.na(hashtags)
  )
# 2. Extract and calculate top hashtags
top_hashtags <- kitchenship_cohort %>%
  # "#food #life" becomes one row per tag so each can be counted separately
  separate_rows(hashtags, sep = " ") %>%
  # Normalize: lowercase and strip '#' so #Food and #food collapse together
  mutate(clean_tag = str_remove_all(str_to_lower(hashtags), "#")) %>%
  # Drop empty tokens produced by stray/double spaces
  filter(clean_tag != "") %>%
  group_by(clean_tag) %>%
  summarize(
    total_uses = n(),
    # na.rm = TRUE: engagement_rate can be NA when comments are missing,
    # which would otherwise knock otherwise-strong tags out of the ranking
    avg_engagement = mean(engagement_rate, na.rm = TRUE)
  ) %>%
  # Require a minimum sample size to avoid one-hit wonders
  filter(total_uses >= 3) %>%
  arrange(desc(avg_engagement)) %>%
  head(10) # Grab the top 10
# 3. Build the Visual: horizontal bars of the top hashtags by engagement.
# Tooltip built with paste0 so the tag renders as "#food" — the original
# paste() inserted a space after '#' ("Tag: # food").
hashtag_plot <- ggplot(top_hashtags, aes(x = reorder(clean_tag, avg_engagement),
                                         y = avg_engagement,
                                         fill = avg_engagement,
                                         text = paste0("Tag: #", clean_tag,
                                                       "<br>Uses: ", total_uses,
                                                       "<br>Avg Engagement: ", round(avg_engagement, 4)))) +
  geom_col(show.legend = FALSE) +
  coord_flip() + # Flip sideways so the tag names are easy to read
  scale_fill_viridis_c(option = "magma") +
  theme_minimal() +
  labs(
    title = "Top Hashtags in US Lifestyle Content",
    subtitle = "Recommended default tags for the Kitchenship upload UI",
    x = "Hashtag",
    y = "Average Engagement Rate"
  ) +
  theme(
    # Center the title (0.5 is the middle) and make it bold
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    # Center the subtitle as well for a balanced look
    plot.subtitle = element_text(hjust = 0.5, size = 10, margin = margin(b = 10)),
    # Clean up the grid
    panel.grid.major.y = element_blank()
  )
# 4. Make it interactive
ggplotly(hashtag_plot, tooltip = "text")
The insights gathered from this analysis provide the foundational “Blueprints” for our product’s ecosystem. While the data confirms that high-reach video and high-depth text are our primary engines, the true success of the platform will lie in the User Experience that bridges them.
Key Strategic Pillars for Next Phase:
* Infrastructure Over
Intuition: We will move away from generic “Lifestyle” tags and begin
building a proprietary, hyper-granular taxonomy that serves our specific
niche.
* Routine-Based UX: Design development will focus on the
Sunday/Monday “planning” window, ensuring our most valuable tools are
front-and-center when user intent is highest.
* Competitive
Differentiation: By focusing our feature development on the “cold zones”
identified in the market gap analysis, we ensure our platform offers a
unique value proposition that legacy networks cannot easily
replicate.
Next Step: Transition these data-backed requirements into the high-fidelity UI/UX design phase to begin prototyping the core user journeys.